Visual Analytics project to analyze and discover insights into video game sales in recent years, using the high-level API plotly.express.
# Load the Pandas libraries
import pandas as pd
# Load Plot libraries
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
The first step is to load the dataset into a pandas dataframe. Here you can see the dataset.
# Path of the sales CSV that feeds the analysis.
dataURL = "../data/vgsales_2020.csv"
# Parse the CSV into a DataFrame.
raw_data = pd.read_csv(filepath_or_buffer=dataURL)
# Trailing expression: shows the loaded table when run as a notebook cell.
raw_data
Now the basic statistics of the numeric fields are shown, to have a quick understanding of the behavior of the data.
# Summary statistics of the loaded data; left as a bare expression so a
# notebook renders the resulting table as the cell output.
raw_data.describe()
Important Note: Sales are grouped by each game's release year, not by the actual sale date, because the dataset does not include historical sales dates.
# Total sales per release year.
# numeric_only=True restricts the aggregation to numeric columns; without it,
# pandas >= 2.0 also tries to "sum" text columns such as Publisher/Platform.
gd_sales = raw_data.groupby(["Year"]).sum(numeric_only=True)
gd_sales.reset_index(inplace=True)
# Plot global trend
fig = px.line(gd_sales, x="Year", y="Global_Sales")
# Dotted vertical marker at release year 2008. The y extent is expressed in
# "paper" coordinates (0..1 of the plot height) so the marker always spans the
# whole chart instead of stopping at a hard-coded sales value.
fig.add_shape(
    dict(
        type="line",
        xref="x",
        yref="paper",
        x0=2008,
        y0=0,
        x1=2008,
        y1=1,
        line=dict(color="RoyalBlue", width=2, dash="dot"),
    )
)
fig.update_layout(height=400)
fig.update_xaxes(title_text="Release Year")
fig.update_yaxes(title_text="# Global Sales")
fig.show()
Now we can plot the top 50 best-selling video games in the world.
# Data
# Number of games shown in the ranking; the chart title below is derived from it.
top_games = 50
raw_data.head(10)
# Plot the best-selling video games, colored by Publisher
# NOTE(review): head() takes the first rows as they appear in the file, so this
# presumably relies on the CSV being ordered by descending Global_Sales - confirm.
fig = px.bar(
    raw_data.head(top_games),
    x='Global_Sales',
    y='Name',
    color='Publisher',
    orientation='h',
    hover_data=["Platform"],
)
# Order the bars by their total value and keep the legend visible; the title
# follows top_games so it cannot drift from the number of rows plotted.
fig.update_layout(
    yaxis={'categoryorder': 'total ascending'},
    showlegend=True,
    height=800,
    title_text=f"Top {top_games} Best-Selling Video Games",
)
fig.update_xaxes(title_text="# Global Sales")
fig.update_yaxes(title_text="")
fig.show()
# Grouped data: totals for every (Platform, Publisher) pair.
# numeric_only=True keeps the aggregation to numeric columns; without it,
# pandas >= 2.0 also tries to "sum" the remaining text columns.
gd = raw_data.groupby(['Platform', 'Publisher']).sum(numeric_only=True)
gd.reset_index(inplace=True)
gd.head(10)
# Plot Video Game Sales grouped by Platform (publishers nested inside each platform)
fig = px.treemap(gd, path=['Platform', 'Publisher'], values='Global_Sales')
fig.show()
# Plot Video Game Sales grouped by Publisher (platforms nested inside each publisher)
fig = px.treemap(gd, path=['Publisher', 'Platform'], values='Global_Sales')
fig.show()
Important Note: Sales are grouped by each game's release year, not by the actual sale date, because the dataset does not include historical sales dates.
# How many publishers to keep. Stored under its own name so that
# `top_companies` only ever holds the list of publisher names and the ranking
# step below can be re-run without head() receiving a list.
n_top_companies = 10
# Top 10 Companies
# numeric_only=True keeps the aggregation to numeric columns; without it,
# pandas >= 2.0 also tries to "sum" the remaining text columns.
gd = raw_data.groupby(['Publisher']).sum(numeric_only=True)
gd = gd.sort_values(by='Global_Sales', ascending=False)
top_companies = list(gd.head(n_top_companies).index)
top_companies
# Grouped data: yearly totals, restricted to the selected publishers
gd = raw_data[raw_data["Publisher"].isin(top_companies)].groupby(['Year', 'Publisher']).sum(numeric_only=True)
gd = gd.sort_values(by='Year', ascending=True)
gd.reset_index(inplace=True)
gd.head(10)
Plotting Sales Trends of Top 10 Publishers
# One line per publisher, drawn from the grouped frame `gd`.
fig = px.line(data_frame=gd, x="Year", y="Global_Sales", color="Publisher")
# Chart title and both axis titles applied in a single layout update.
fig.update_layout(
    title_text="Sales Trends of Top 10 Publishers",
    xaxis_title_text="Release Year",
    yaxis_title_text="# Global Sales",
)
fig.show()
This multi-line chart confirms the insights obtained in point 4.
Next, the relationship between Platform and Genre for games released from 2013 onward.
# Keep only the rows whose Year is 2013 or later.
recent_games = raw_data.query("Year >= 2013")
# Parallel categories diagram linking each row's Platform to its Genre.
fig = px.parallel_categories(recent_games, dimensions=["Platform", "Genre"])
fig.show()
# Total sales per release year and genre.
# numeric_only=True keeps the aggregation to numeric columns; without it,
# pandas >= 2.0 also tries to "sum" the remaining text columns.
gd = raw_data.groupby(['Year', 'Genre']).sum(numeric_only=True)
gd.reset_index(inplace=True)
gd
# Area chart of the aggregated frame: one point per (Year, Genre). Plotting the
# un-aggregated raw_data instead would feed every individual game row, in file
# order, into the chart rather than the yearly totals computed above.
fig = px.area(gd, x='Year', y='Global_Sales', color='Genre')
fig.update_layout(title_text="Evolution of Video Game Sales by Genre")
fig.update_xaxes(title_text="Release Year")
fig.update_yaxes(title_text="# Global Sales")
fig.show()
# Cook the data: total Global_Sales per Genre for each decade of release.
gd_1980s = raw_data.query("Year>=1980 and Year<1990")[["Genre", "Global_Sales"]].groupby(['Genre']).sum()
gd_1980s.reset_index(inplace=True)
# Lower bound is 1990 (it previously read 1900, which pulled every earlier
# release, including the whole 1980s, into the 1990s slice).
gd_1990s = raw_data.query("Year>=1990 and Year<2000")[["Genre", "Global_Sales"]].groupby(['Genre']).sum()
gd_1990s.reset_index(inplace=True)
gd_2000s = raw_data.query("Year>=2000 and Year<2010")[["Genre", "Global_Sales"]].groupby(['Genre']).sum()
gd_2000s.reset_index(inplace=True)
# Open-ended on purpose: everything released from 2010 onward lands here.
gd_2010s = raw_data.query("Year>=2010")[["Genre", "Global_Sales"]].groupby(['Genre']).sum()
gd_2010s.reset_index(inplace=True)
# Create subplots: use 'domain' type for Pie subplot
fig = make_subplots(rows=2, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}], [{'type':'domain'}, {'type':'domain'}]])
# One donut per decade, filled left-to-right, top-to-bottom in the 2x2 grid.
decade_frames = [
    ("1980s", gd_1980s),
    ("1990s", gd_1990s),
    ("2000s", gd_2000s),
    ("2010s", gd_2010s),
]
for position, (decade, frame) in enumerate(decade_frames):
    # divmod maps positions 0..3 to zero-based (row, col); subplot indices are 1-based.
    grid_row, grid_col = divmod(position, 2)
    fig.add_trace(
        go.Pie(labels=frame["Genre"], values=frame["Global_Sales"], name=decade, title=decade, hole=.3),
        grid_row + 1,
        grid_col + 1,
    )
fig.update_layout(height=800, title_text="Video Game Sales by Genre by Decades")
fig.show()
Finally, how the video game genre trend has changed in each decade: